# Univariate plot showing the relationship between one predictor and the outcome.
# Creates up to `n` bins on the x-axis. For numeric/date variables, uses quantile
# binning (ggplot2::cut_number()); otherwise forcats::fct_lump_n() lumps rare
# levels into at most `n` bins. Adds an explicit bin for missing values (NA).
# Points are posterior means with 95% credible intervals from empirical Bayes
# (Laplace smoothing) of the bin-level probability.
plot_data <- function(data, var, n = 12) {
  # n is the maximum number of bins/levels to use in the plot
  k <- 5                  # shrinkage prior (number of prior pseudo-observations)
  y <- 1 - data$outcome   # set survival as the outcome of interest
  x <- data %>% pull({{ var }})
  if (all(is.na(x))) return(ggplot())  # nothing to plot

  mu <- mean(y)  # overall survival rate = prior mean for the smoothing

  # Discretize the predictor into at most `n` bins.
  # NOTE: `n` is a plain function argument, not a data-masked column, so it is
  # used directly (the previous `{{n}}` only worked by accident as `{ {n} }`).
  if (n_distinct(x) <= n) x <- factor(x)
  if (is.character(x)) x <- factor(x)
  if (is.factor(x)) x <- fct_lump_n(x, n = n)
  if (is.numeric(x) || lubridate::is.POSIXt(x) || lubridate::is.Date(x)) {
    x <- ggplot2::cut_number(x, n = n)  # quantile binning
  }
  x <- fct_na_value_to_level(x)  # convert NAs to explicit level

  tibble(x, y) %>%
    group_by(x) %>%
    summarize(
      n = n(),                           # number in bin/level
      n1 = sum(y > 0),                   # number of outcomes of interest in bin/level
      p = (n1 + k * mu) / (n + k),       # Laplace smoothing estimate
      alpha = (n1 + k * mu),             # posterior Beta parameters
      beta = (n - n1 + k * (1 - mu)),
      lower = qbeta(.025, alpha, beta),  # 95% credible interval
      upper = qbeta(.975, alpha, beta)
    ) %>%
    ggplot(aes(x, p)) +
    geom_hline(yintercept = mean(y), lty = 3) +  # overall rate reference line
    geom_errorbar(aes(ymin = lower, ymax = upper)) +
    geom_point() +
    scale_x_discrete(labels = scales::label_wrap(10)) +
    # as_name(enquo(var)) labels the axis whether `var` was passed as a bare
    # symbol or a string (sym(var) failed for bare symbols)
    labs(x = rlang::as_name(rlang::enquo(var)), y = "Waitlist Survival") +
    coord_cartesian(ylim = c(.80, 1))
}
Code
library(mgcv)

# Fit a univariate smoother of y (binary) on x for exploratory effect plots.
# - Drops observations where either x or y is missing.
# - Returns NULL when x has a single distinct value (nothing to fit).
# - Character or low-cardinality x is converted to a factor and fit with a
#   random-effect basis ("re"); otherwise a thin-plate spline ("tp") is used.
# method = "gam" fits mgcv::gam(); method = "glm" fits an unpenalized glm.
smoother <- function(x, y, method = "gam") {
  method <- match.arg(method, c("gam", "glm"))

  # keep complete cases only
  missing <- (is.na(x) | is.na(y))
  x <- x[!missing]
  y <- y[!missing]
  if (n_distinct(x) == 1) return(NULL)

  if (is.character(x) || n_distinct(x) < 7) x <- factor(x)
  # scalar condition: plain if/else, not ifelse()
  bs <- if (is.factor(x)) "re" else "tp"

  if (method == "gam") {
    tibble(x, y) %>%
      mgcv::gam(y ~ s(x, bs = bs, k = 8),  # max of ~k edf
                method = "REML",
                family = binomial(),
                data = .)
  } else if (method == "glm") {
    tibble(x, y) %>%
      glm(y ~ x,
          family = binomial(),
          data = .)
  } else {
    # unreachable after match.arg(); kept as a defensive guard
    stop("`method` is not valid")
  }
}
# Reproducible 10-fold cross-validation, stratified on the outcome
set.seed(2024)
cv_folds <- rsample::vfold_cv(data_train, v = 10, strata = outcome)
Predictive Modeling
Create baseline preprocessing recipe and set predictor variables. All models start with this recipe.
Sets predictors and outcome variables
Converts Diabetes to a binary indicator (1 = any diabetes, 0 = none or unknown)
Converts Functional Status to numeric, adds indicator for baby and unknown
Cleans eGFR: removes outliers, code based on kidney risk, binary for eGFR < 60
Creates binary for Albumin < 3
Code
library(tidymodels)

# Baseline preprocessing recipe. Every model starts from this recipe:
# it sets the predictors/outcome, removes identifier and redundant variables,
# and recodes Diabetes, Functional Status, eGFR, and Albumin.
base_rec =
  #: Set formula
  # head(data_train) is enough for recipe() to learn column names and types
  recipe(outcome ~ ., data = head(data_train)) %>%
  # step_naomit(outcome) %>% # remove rows with missing outcome
  # skip=TRUE: the outcome recode runs at prep/train time only
  step_mutate(outcome = factor(outcome, levels = c(1, 0)), skip=TRUE) %>%
  #: Remove variables from all models
  step_rm(WL_ID_CODE) %>%  # identifier, not a predictor
  step_rm(matches("DONCRIT_.+_AGE"),
          matches("DONCRIT_.+_HGT"),
          matches("DONCRIT_.+_WGT"),
          matches("DONCRIT_.+_MILE"),
          matches("DONCRIT_"),     # remove all DONCRIT variables. They are primarily
                                   # recorded for a few UNOS REGIONS.
          # --- not clinical
          CITIZENSHIP, HEMODYNAMICS_CO,
          # lots of missing
          CEREB_VASC,
          # ---
          CAND_DIAG_LISTING,       # use CAND_DIAG instead
          CAND_DIAG_CODE,          # use CAND_DIAG instead
          MOST_RCNT_CREAT,         # use eGFR instead
          VAD_DEVICE_TY_TCR,       # not enough info
          WL_DT,                   # use LIST_YR for temporal information
          #------------------------------------- Substitutes
          # LC_effect,
          median_wait_days,        # use LC_effect instead
          median_wait_days_1A,     # use LC_effect instead
          median_wait_days_STATUS, # use LC_effect instead
          median_refusals_old,     # use median_refusals instead
          mean_refusals,           # use median_refusals instead
          p_refusals,              # use median_refusals instead
          #-------------------------------------
          # LISTING_CTR_CODE,     # Let individual models choose
          REGION,                  # Some regions only have 1-2 centers
          ## Requested by Dr. Haregu
          LIFE_SUPPORT_OTHER, PGE_TCR, LIST_YR,
  ) %>%
  #: Additional cleaning
  step_mutate(
    # convert Diabetes to Yes = 1, No = 0 ("Unknown" lumped with "None")
    DIAB = case_match(DIAB, "None" ~ 0L, "Unknown" ~ 0L, .default = 1L)
  ) %>%
  #: Convert Functional status to numeric; add indicators for missing and baby
  step_mutate(
    # extract the leading percentage from the label (e.g. "70% ..."); 0 when absent
    FUNC_STAT_NUM = str_extract(FUNC_STAT_CAND_REG, "(\\d+)%", group = 1) %>%
      as.numeric() %>%
      coalesce(0),
    FUNC_STAT_UNKNOWN = ifelse(FUNC_STAT_CAND_REG == "Unknown", 1L, 0L),
    CAND_UNDER_1 = ifelse(FUNC_STAT_CAND_REG == "Not Applicable (patient < 1 year old)", 1L, 0L),
  ) %>%
  step_rm(FUNC_STAT_CAND_REG) %>%  # remove the original variable
  #: Cutoffs for eGFR (Kidney Disease) and Albumin (Nutrition)
  step_mutate(
    eGFR = pmin(eGFR, 250),  # fix outliers for non-tree models
    # code eGFR into chronic-kidney-disease-style stages
    eGFR_CODED = case_when(
      eGFR > 120 ~ 0,
      eGFR >= 90 ~ 1,
      eGFR >= 60 ~ 2,
      eGFR >= 45 ~ 3,
      eGFR >= 30 ~ 3.5,
      eGFR >= 15 ~ 4,
      eGFR < 15 ~ 5,
      .default = 0),  # if missing, assume eGFR is good
    eGFR_UNDER_60 = ifelse(eGFR < 60, 1, 0) %>%
      coalesce(0),  # if missing, assume eGFR is good (above 60)
    ALBUM_UNDER_3 = ifelse(TOT_SERUM_ALBUM < 3, 1, 0) %>%
      coalesce(0)   # if missing, assume Albumin is good (above 3)
  )
#: outliers in median refusals
# step_mutate(median_refusals = pmin(median_refusals, 20))
Logistic Regression
1. Tidymodels specification
Lasso logistic regression model.
Remove LISTING_CTR_CODE
Add new missing indicator feature for all variables with missing
Convert Functional Status to number {0, 10, …, 100}. Add indicator for Unknown status.
One-hot encode all categorical predictors. For variables with {Yes, No, Unknown}, only keep the Yes column. This lumps the Unknown with No.
Truncate median_refusals to 20.
Impute all missing values with median.
Create new features by coding eGFR into stages of chronic kidney failure.
Create binary TOT_SERUM_ALBUM < 3 indicator.
Add polynomial terms for the numeric features.
Code
library(tidymodels)

# Model specification: Lasso penalized logistic regression
lasso_spec = logistic_reg() %>%
  set_engine("glmnet") %>%
  set_args(
    mixture = 1,      # 1 = lasso, 0 = ridge
    penalty = tune()  # regularization strength chosen by cross-validation
  )

# Recipe: builds on base_rec; adds missingness indicators, one-hot dummies,
# median imputation, degree-2 polynomials, and scaling (for importance scoring).
lasso_rec = base_rec %>%
  #: Remove additional variables for this model
  step_rm(
    LISTING_CTR_CODE,
  ) %>%
  #: Add additional variables to represent missing predictors
  step_indicate_na(all_predictors()) %>%
  #: Convert categorical predictors to dummy
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%  # one-hot
  step_dummy(all_ordered_predictors(), one_hot = TRUE) %>%  # one-hot
  step_rm(ends_with("_No")) %>%       # removes the No level (Yes is binary)
  step_rm(ends_with("_Unknown")) %>%  # removes the Unknown level (Yes is binary).
                                      # This effectively treats "Unknown" same as "No"
  #: Impute missing values
  step_impute_median(all_numeric_predictors()) %>%
  #: Add polynomial terms
  step_poly(
    AGE, WEIGHT_KG, HEIGHT_CM, BMI, BSA, eGFR,
    # TOT_SERUM_ALBUM,
    FUNC_STAT_NUM,
    # pedhrtx_prev_yr,
    # median_refusals,
    # LC_effect
    degree = 2,
  ) %>%
  #: Remove all zero variance predictors (e.g., from step_indicate_na() )
  step_zv(all_predictors()) %>%
  #: Scale all predictors for variable importance scoring
  step_scale(all_numeric_predictors())

# Workflow: recipe + model specification
lasso_wflow = workflow(preprocessor = lasso_rec, spec = lasso_spec)
# Model specification: Random Forest (ranger engine)
rf_spec = rand_forest() %>%
  set_mode("classification") %>%
  set_engine("ranger",
    seed = 2024,
    importance = "impurity", #"none",
    num.threads = 8
  ) %>%
  set_args(
    mtry = tune(),  # tuned via OOB pre-search, then cross-validation
    trees = 2000,
    min_n = 2
  )

# Recipe: same preprocessing as the lasso model minus polynomials/scaling
# (trees do not need either).
rf_rec = base_rec %>%
  #: Remove additional variables for this model
  step_rm(
    LISTING_CTR_CODE,
  ) %>%
  #: Add additional variables to represent missing predictors
  step_indicate_na(all_predictors()) %>%
  #: Convert categorical predictors to dummy
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%  # one-hot
  # step_dummy(all_ordered_predictors(), one_hot = TRUE) %>% # one-hot
  step_rm(ends_with("_No")) %>%       # removes the No level (Yes is binary)
  step_rm(ends_with("_Unknown")) %>%  # removes the Unknown level (Yes is binary).
                                      # This effectively treats "Unknown" same as "No"
  #: Impute missing values
  step_impute_median(all_numeric_predictors()) %>%
  #: Remove all zero variance predictors (e.g., from step_indicate_na() )
  step_zv(all_predictors())

# Workflow: recipe + model specification
rf_wflow = workflow(preprocessor = rf_rec, spec = rf_spec)
2. Tune mtry
Use OOB observations for tuning instead of cross-validation.
Code
# Pre-tuning: use the OOB Brier score to seed the full cross-validated grid
# search. This is used to speed along tune_grid() by narrowing mtry candidates.

# Fit the forest workflow at one mtry and return ranger's OOB error.
# According to help(ranger), the Brier metric is used for the OOB error
# of probability forests.
oob_brier <- function(mtry) {
  # `fitted_wf` (not `fit`) so we don't shadow the generic fit()
  fitted_wf <- rf_wflow %>%
    finalize_workflow(list(mtry = mtry)) %>%
    fit(data_train)
  tibble(
    mtry,
    oob_error = fitted_wf %>% extract_fit_engine() %>% pluck("prediction.error")
  )
}

# Number of features after preprocessing
num_cols <- prep(rf_rec, data_train)$term_info %>% nrow()
# Largest mtry to try: ~2*sqrt(p), capped at p
mtry_max <- min(num_cols, 2 * sqrt(num_cols)) %>% floor()
# Candidate mtry values; spell out length.out (avoid partial argument matching)
mtry_oob_grid <- seq(1, mtry_max, length.out = 50) %>% floor() %>% unique()

oob_perf <- map_df(mtry_oob_grid, oob_brier)  # OOB performance per mtry

# Keep the best 5 mtry values for the cross-validated search
mtry_grid <- oob_perf %>%
  slice_min(oob_error, n = 5) %>%
  select(mtry)
Fixing learn_rate(eta) to 0.10, sample_size = 0.80, and tuning tree_depth and trees. Fixing learn_rate and tuning the number of trees is good for efficiency due to the multi-predict capabilities.
Remove LISTING_CTR_CODE
One-hot encode all nominal predictors. For variables with {Yes, No, Unknown}, only keep the Yes column. This lumps the Unknown with No.
Let xgboost handle missing values
Code
library(xgboost)
library(tidymodels)

# Model specification: XGBoost boosted trees.
# learn_rate and sample_size are fixed; trees and tree_depth are tuned.
bt_spec = boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost", nthread = 8) %>%
  set_args(
    trees = tune(),
    tree_depth = tune(),
    learn_rate = 0.10,  # fixed
    sample_size = .80,  # fixed
  )

# Recipe:
## Let xgboost handle missing values internally; does not impute
bt_rec = base_rec %>%
  step_rm(
    LISTING_CTR_CODE,
  ) %>%
  # step_dummy(LISTING_CTR_CODE, one_hot = TRUE) %>%
  #: Convert categorical predictors to dummy
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%  # one-hot
  # step_dummy(all_ordered_predictors(), one_hot = TRUE) %>% # one-hot
  step_rm(ends_with("_No")) %>%       # removes the No level (Yes is binary)
  step_rm(ends_with("_Unknown")) %>%  # removes the Unknown level (Yes is binary).
                                      # This effectively treats "Unknown" same as "No"
  #: Remove all zero variance predictors (e.g., from step_indicate_na() )
  step_zv(all_predictors())

# Workflow: recipe + model specification
bt_wflow = workflow(preprocessor = bt_rec, spec = bt_spec)
2. Tuning
Code
# Tuning grid: deeper trees get fewer boosting iterations.
grid_for_depth <- function(depth, first, last) {
  tibble(trees = seq(first, last, by = 5), tree_depth = depth)
}
bt_grid <- bind_rows(
  grid_for_depth(1, 25, 300),
  grid_for_depth(2, 10, 150),
  grid_for_depth(3, 10, 75),
  grid_for_depth(4, 10, 50)
)
# expand_grid(trees = seq(25, 250, by = 10), tree_depth = 1:4)

#: don't use bayes here since it won't exploit the multi-predict efficiency
set.seed(1000)
tune_res <- tune_grid(
  object = bt_wflow,
  resamples = cv_folds,
  grid = bt_grid,
  metrics = metric_set(roc_auc, brier_class, mn_log_loss, accuracy),
  control = control_grid(verbose = FALSE)
)
3. CV performance
Code
# Top tuning results ranked by cross-validated AUC
show_best(tune_res, metric = "roc_auc")
Code
# CV AUC as a function of the number of trees, one curve per tree depth
auc_by_depth <- tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(trees)

ggplot(auc_by_depth, aes(trees, mean, color = factor(tree_depth))) +
  geom_point() +
  geom_line() +
  labs(x = "Number of Trees", y = "Avg AUC", color = "tree depth")
4. Final model fit
Code
# Final model fit: best (trees, tree_depth) combination by cross-validated AUC
bt_tune <- select_best(tune_res, metric = "roc_auc")
print(bt_tune)
# Predictors that survive the base recipe and also exist in the raw training data
features <- prep(base_rec, data_train) %>%
  bake(new_data = NULL, all_predictors()) %>%
  colnames() %>%
  intersect(colnames(data_train))

# Plot the partial (additive) effect of `var` for each fitted model on one
# panel. Predictors with > 6 distinct non-missing values that are not
# character/factor are drawn as lines; everything else as dodged bars.
# A rug weighted by n shows where the data lie.
# NOTE(review): assumes get_additive_effects() returns a data frame containing
# the predictor column, `eta` (partial effect), and `n` — confirm its contract.
plot_multi_effects <- function(var) {
  # NOTE(review): called via walk(features, ...) with string inputs; confirm
  # ensym() resolves them as intended in that calling pattern.
  var <- rlang::ensym(var)
  df <- bind_rows(
    lasso = get_additive_effects(var, model = lasso_fit, data = data_train),
    xgboost = get_additive_effects(var, model = bt_fit, data = data_train),
    .id = "model"
  ) %>%
    rename(x = !!var, y = eta)

  rug_data <- df %>% filter(!is.na(n))
  n_bks <- nrow(rug_data)
  # scalar conditions: use || and && (not the vectorized | and &)
  categorical <- is.character(df$x) || is.factor(df$x)

  if (n_bks > 6 && !categorical) {
    # numeric predictor: one line per model
    plt <- ggplot(df) +
      geom_hline(yintercept = 0, color = "orange") +
      geom_line(aes(x, y, color = model))
  } else {
    # categorical (or few distinct values): dodged bars
    rug_data <- rug_data %>% mutate(x = as.factor(x))
    plt <- ggplot(rug_data) +
      geom_hline(yintercept = 0, color = "orange") +
      geom_col(aes(x, y, fill = model), width = 1/3, position = "dodge") +
      scale_x_discrete(labels = scales::label_wrap(15))  # `labels`, not partial `label`
  }

  print(
    plt +
      geom_rug(data = rug_data %>% distinct(x, n), aes(x, linewidth = n),
               show.legend = FALSE, sides = "b", alpha = .25) +
      labs(x = as.character(var), y = "partial effect") +
      scale_y_continuous(breaks = seq(-10, 10, by = .25)) +
      scale_color_brewer(type = "qual", palette = "Dark2") +
      scale_fill_brewer(type = "qual", palette = "Dark2") +
      coord_cartesian(ylim = c(-1, 1)) +
      theme(
        legend.position = c(0.02, 0.98),
        legend.justification = c("left", "top"),
        legend.title = element_blank()
      )
  )
}

walk(features, plot_multi_effects)
Feature Importance
The feature importance metric is the mean absolute effect (like shap importance).
Code
# Mean Absolute Effect
# Feature importance = mean absolute partial effect (akin to SHAP importance),
# weighted by the number of observations at each predictor value, per model.
feature_importance <- function(var) {
  var <- rlang::ensym(var)
  # return the summary tibble directly — an assignment as the final
  # expression would return its value invisibly
  bind_rows(
    lasso = get_additive_effects(var, model = lasso_fit, data = data_train),
    xgboost = get_additive_effects(var, model = bt_fit, data = data_train),
    .id = "model"
  ) %>%
    rename(x = !!var, y = eta) %>%
    filter(!is.na(n)) %>%
    group_by(model) %>%
    summarize(Importance = sum(n * abs(y)) / sum(n))
}

# One importance value per (variable, model); bars ordered by mean importance
map(set_names(features), feature_importance) %>%
  bind_rows(.id = "Variable") %>%
  mutate(Variable = fct_reorder(Variable, Importance, .fun = "mean")) %>%
  ggplot(aes(x = Importance, y = Variable, fill = model)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(type = "qual", palette = "Dark2")
Center Level Performance
This shows the predictive bias: test survival - predicted survival, by listing center. The top centers have the largest volume (n). Removed centers with \(n \le 2\).
As an example, the third center from the top (19468) has a bias of about -0.10. This means that the actual survival at this center was about 10 percentage points lower than the models predicted.
While I’m not too concerned about the bias in the low volume centers (bottom on plot), there does appear to be modest unaccounted center effects.